package au.com.acpfg.misc.spectra; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.util.ArrayList; import java.util.logging.Logger; import org.knime.base.node.util.BufferedFileReader; import org.knime.core.data.DataCell; import org.knime.core.data.DataType; import org.knime.core.data.RowKey; import org.knime.core.data.container.DataContainer; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.DoubleCell; import org.knime.core.data.def.IntCell; import org.knime.core.data.def.StringCell; import org.knime.core.node.ExecutionContext; import org.knime.core.node.NodeLogger; import org.proteomecommons.io.Peak; import org.proteomecommons.io.GenericPeak; import org.systemsbiology.jrap.stax.DataProcessingInfo; import org.systemsbiology.jrap.stax.MSInstrumentInfo; import org.systemsbiology.jrap.stax.MSOperator; import org.systemsbiology.jrap.stax.MZXMLFileInfo; import org.systemsbiology.jrap.stax.SoftwareInfo; /** * Implements support for .mgf and .mgf.gz files using ProteomeCommons IO framework. The * only data loaded into the table (in the spectra column!) is as follows: * BEGIN IONS TITLE=The first peptide - dodgy peak detection, so extra wide tolerance PEPMASS=896.05 25674.3 CHARGE=3+ TOL=3 TOLU=Da SEQ=n-AC[DHK]s COMP=2[H]0[M]3[DE]*[K] 240.1 3 242.1 12 245.2 32 ... * query parameters, in particular are not currently supported. * * @author andrew.cassin * */ public class MGFDataProcessor extends AbstractDataProcessor { private BufferedReader m_is; private String m_filename; public MGFDataProcessor() { m_is = null; } @Override public boolean can(File f) throws Exception { m_filename = f.getName(); String ext = m_filename.toLowerCase(); return (ext.endsWith(".mgf") || ext.endsWith(".mgf.gz")); } @Override public void process(boolean load_spectra, RowSequence scan_seq, RowSequence file_seq, ExecutionContext exec, DataContainer scan_container, DataContainer file_container) throws Exception { if (m_is == null) throw new Exception("No file to load!"); String line; StringBuilder headers = new StringBuilder(10 * 1024); StringBuilder peak_list = new StringBuilder(10 * 1024); boolean got_start = false; boolean in_headers= false; int done = 0; int peaks= 0; int ncols= scan_container.getTableSpec().getNumColumns(); while ((line = m_is.readLine()) != null) { if (!got_start && line.startsWith("BEGIN IONS")) { got_start = true; in_headers = true; headers.delete(0, headers.length()); peak_list.delete(0, peak_list.length()); peaks = 0; } else if (got_start && line.startsWith("END IONS")) { got_start = false; in_headers= false; done++; if (peaks > 0) { process_spectra(headers.toString(), peak_list.toString(), peaks, scan_seq, load_spectra, ncols, scan_container); } else { NodeLogger.getLogger(SpectraReaderNodeModel.class).warn("Got spectra with no peaks!"); } if (done % 100 == 0) { exec.checkCanceled(); } } else if (got_start) { // if its a digit, we have finished the headers char c = line.charAt(0); if (Character.isDigit(c)) { in_headers = false; peaks++; peak_list.append(line); peak_list.append("\n"); } else { in_headers = true; headers.append(line); headers.append("\n"); } } } // HACK: add a largely blank file container row as the MGF will not // provide any suitable data for this table ncols = file_container.getTableSpec().getNumColumns(); DataCell[] cells = new DataCell[ncols]; assert(ncols == 9); cells[0] = DataType.getMissingCell(); cells[1] = DataType.getMissingCell(); cells[2] = DataType.getMissingCell(); cells[3] = DataType.getMissingCell(); cells[4] = DataType.getMissingCell(); cells[5] = DataType.getMissingCell(); cells[6] = DataType.getMissingCell(); cells[7] = DataType.getMissingCell(); cells[8] = safe_cell(m_filename); file_container.addRowToTable(new DefaultRow(new RowKey(file_seq.get()), cells)); } /** * Called with the data for a single spectra at a time, this routine must update the * spectra cells and add the row as appropriate. This code is pretty ugly so as to handle * the variability and flexibility in what may or may not be specified in the file. * * @param header * @param peak_list */ protected void process_spectra(String header, String peak_list, int n_peaks, RowSequence sseq, boolean load_spectra, int ncols, DataContainer c) { assert(header != null && peak_list != null); MyMGFPeakList mgf = new MyMGFPeakList(); // 1. process the headers for (String line : header.split("\\n")) { int pos = line.indexOf('='); if (pos >= 0) { String key = line.substring(0, pos); String val = line.substring(pos+1).trim(); mgf.addHeader(key, val); } } // 2. process the peak list double[] mz = new double[n_peaks]; double[] intensity = new double[n_peaks]; int cnt = 0; boolean has_intensity = true; for (String line : peak_list.split("\\n")) { String[] fields = line.split("\\s+"); if (fields.length > 1) { mz[cnt] = Double.parseDouble(fields[0]); intensity[cnt] = Double.parseDouble(fields[1]); } else { has_intensity = false; mz[cnt] = Double.parseDouble(fields[0]); } cnt++; } mgf.setPeaks(mz, has_intensity ? intensity : null); DataCell[] cells = new DataCell[ncols]; for (int i=0; i<ncols; i++) { cells[i] = DataType.getMissingCell(); } cells[21] = new StringCell(m_filename); if (ncols > 23) { cells[23] = SpectraUtilityFactory.createCell(mgf); } cells[22] = new IntCell(mgf.getNumPeaks()); String pepmass = mgf.getPepmass_safe(); if (pepmass != null) cells[13] = new DoubleCell(Double.parseDouble(pepmass)); else cells[13] = DataType.getMissingCell(); String charge = mgf.getCharge_safe(); if (charge != null) { charge = charge.trim().replaceAll("\\+", ""); if (charge.length() > 0) cells[10] = new IntCell(Integer.parseInt(charge)); else cells[10] = DataType.getMissingCell(); } cells[0] = new StringCell(mgf.getTitle_safe()); c.addRowToTable(new DefaultRow(sseq.get(), cells)); } @Override public void setInput(String filename) throws Exception { try { m_is = BufferedFileReader.createNewReader(new FileInputStream(new File(filename))); } catch (Exception e) { m_is = null; NodeLogger.getLogger(SpectraReaderNodeModel.class).warn("Cannot open "+filename+", reason: "+e); } } }